Data Cleaning
#import data
library(readr)
games_data <- read_csv("vgsales-12-4-2019-short.csv",
col_types = cols(Rank = col_integer())) #change Rank variable to integer class
#change different variables to factor class
games_data$Genre <- as.factor(games_data$Genre)
games_data$ESRB_Rating <- as.factor(games_data$ESRB_Rating)
games_data$Platform <- as.factor(games_data$Platform)
games_data$Publisher <- as.factor(games_data$Publisher)
games_data$Developer <- as.factor(games_data$Developer)
games_data$Year <- as.factor(games_data$Year)
#rename the levels of the ESRB Rating variable and order them
#KA (Kids to Adults) rating was changed to Everyone
library(plyr)
games_data$ESRB_Rating <- revalue(games_data$ESRB_Rating, c("E"="Everyone", "E10"="Everyone10+", "T"="Teen", "M"="Mature", "AO" = "AdultsOnly", "RP"="RatingPending", "KA"="Everyone", "EC"="EarlyChildhood"))
games_data$ESRB_Rating <- ordered(games_data$ESRB_Rating, levels = c("EarlyChildhood", "Everyone", "Everyone10+", "Teen", "Mature", "RatingPending", "AdultsOnly"))
#Change levels of Publisher and Developer variables called Unknown to missing
levels(games_data$Publisher)[levels(games_data$Publisher)=='Unknown'] <- NA
levels(games_data$Developer)[levels(games_data$Developer)=='Unknown'] <- NA
#Rename a couple Publisher and Developer levels
games_data$Publisher <- revalue(games_data$Publisher,
c("Microsoft Game Studios"="Microsoft Studios",
"Valve"="Valve Corporation", "Valve Software"="Valve Corporation", #all Valve Corporation
"Sony Computer Entertainment"="Sony Interactive Entertainment")) #all Sony Interactive Entertainment
games_data$Developer <- revalue(games_data$Developer, c("Microsoft Game Studios"="Microsoft Studios", "Valve"="Valve Corporation", "Valve Software"="Valve Corporation", "Sony Computer Entertainment"="Sony Interactive Entertainment"))
#Rename the sub-genres to their overall Genre
games_data$Genre <- revalue(games_data$Genre, c("Platform"="Action", "Shooter"="Action", "Fighting"="Action", "Visual Novel"="Adventure", "Racing"="Sports"))
#order genres by similarities for graphing
games_data$Genre <- ordered(games_data$Genre, levels = c("Action", "Action-Adventure", "Adventure", "Role-Playing", "Simulation", "Strategy", "Sports", "Board Game", "Puzzle", "Education", "Music", "MMO", "Party", "Misc"))
attach(games_data)
summary(games_data)
## Rank Name Genre ESRB_Rating
## Min. : 1 Length:55792 Action :17783 Everyone :10814
## 1st Qu.:13949 Class :character Misc : 9476 Teen : 6157
## Median :27896 Mode :character Sports : 8274 Mature : 3314
## Mean :27896 Adventure : 5553 Everyone10+ : 2897
## 3rd Qu.:41844 Role-Playing: 4551 RatingPending: 368
## Max. :55792 (Other) :10146 (Other) : 73
## NA's : 9 NA's :32169
## Platform Publisher Developer
## PC :10978 Sega : 2085 Konami : 911
## PS2 : 3564 Activision : 1519 Sega : 817
## DS : 3292 Ubisoft : 1519 Capcom : 684
## PS : 2703 Electronic Arts: 1498 Namco : 425
## XBL : 2115 Konami : 1495 SNK Corporation: 403
## PSN : 2004 (Other) :42785 (Other) :47779
## (Other):31136 NA's : 4891 NA's : 4773
## Critic_Score User_Score Total_Shipped Global_Sales
## Min. : 1.00 Min. : 2.00 Min. : 0.03 Min. : 0.00
## 1st Qu.: 6.40 1st Qu.: 7.80 1st Qu.: 0.20 1st Qu.: 0.03
## Median : 7.50 Median : 8.50 Median : 0.59 Median : 0.12
## Mean : 7.21 Mean : 8.25 Mean : 1.89 Mean : 0.37
## 3rd Qu.: 8.30 3rd Qu.: 9.10 3rd Qu.: 1.80 3rd Qu.: 0.36
## Max. :10.00 Max. :10.00 Max. :82.86 Max. :20.32
## NA's :49256 NA's :55457 NA's :53965 NA's :36377
## NA_Sales PAL_Sales JP_Sales Other_Sales
## Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00
## 1st Qu.:0.05 1st Qu.:0.01 1st Qu.:0.02 1st Qu.:0.00
## Median :0.12 Median :0.04 Median :0.05 Median :0.01
## Mean :0.28 Mean :0.16 Mean :0.11 Mean :0.04
## 3rd Qu.:0.29 3rd Qu.:0.14 3rd Qu.:0.12 3rd Qu.:0.04
## Max. :9.76 Max. :9.85 Max. :2.69 Max. :3.12
## NA's :42828 NA's :42603 NA's :48749 NA's :40270
## Year
## 2009 : 4507
## 2010 : 3661
## 2011 : 3489
## 2008 : 2979
## 2014 : 2905
## (Other):37272
## NA's : 979
Genre vs. ESRB Rating
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#create subset where missing values in Genre and ESRB_Rating are omitted
data1 <- games_data[complete.cases(games_data$Genre, games_data$ESRB_Rating),] #23619 observations
#plot bar chart of Genre stacked by Rating
plot1 <- ggplot(data1, aes(x=Genre, fill=ESRB_Rating, stat="count"))+
xlab("Genre")+
ylab("Count")+
ggtitle("Video Game Genres vs. ESRB Rating")+
geom_bar(width=0.75)+
theme(axis.text.x = element_text(angle = 85, vjust=0.5),
plot.title = element_text(hjust=0.5))
plot1

Top 100 Ranked Video Games
#Create subset with top 100 games
top100 <- games_data[games_data$Rank < 101,]
#Top 100 ESRB Ratings
rating_100 <- ggplot(data=subset(top100, !is.na(ESRB_Rating)), aes(x=ESRB_Rating, fill=ESRB_Rating, stat="count"))+
xlab("ESRB Rating")+
ylab("Number of Games")+
ggtitle("Top 100 Video Game Ratings")+
geom_text(stat="count",aes(label=..count..), vjust = -.5) +
theme(legend.position = "none", axis.text = element_text(size=12),
plot.title = element_text(hjust=0.5))+
geom_bar(width=0.75)
#Top 100 Genres
genre_100 <- ggplot(top100, aes(x=Genre, stat="count"))+
xlab("Genre")+
ylab("Number of Games")+
ggtitle("Top 100 Video Game Genres")+
geom_bar(width=0.75, fill="#F8766D")+
geom_text(stat="count",aes(label=..count..), vjust = 0) +
theme(legend.position = "none",axis.text.x = element_text(angle = 85, vjust=0.95, hjust = 0.95),
plot.title = element_text(hjust=0.5))
#Top 100 Years
year_100 <- ggplot(top100, aes(x=Year, stat="count"))+
xlab("Year")+
ylab("Number of Games")+
ggtitle("Top 100 Video Game Years")+
geom_bar(width=0.75, fill="#C77CFF")+
geom_text(stat="count",aes(label=..count..), vjust = -.5) +
theme(legend.position = "none",axis.text.x = element_text(angle = 85, vjust=0.5),
plot.title = element_text(hjust=0.5))
#Top 100 Publishers
pub_100 <- ggplot(data=subset(top100, !is.na(Publisher)), aes(x=Publisher, stat="count"))+
xlab("Publisher")+
ylab("Number of Games")+
ggtitle("Top 100 Video Game Publisher")+
geom_bar(width=0.75, fill="#00BFC4")+
geom_text(stat="count",aes(label=..count..), vjust = 0) +
theme(legend.position = "none",axis.text.x = element_text(angle = 85, vjust=0.95, hjust = 0.95),
plot.title = element_text(hjust=0.5))
#Top 100 Critic Scores
critic_100 <- ggplot(data=subset(top100, !is.na(Critic_Score)))+ #without missing values
geom_histogram(mapping=aes(x=Critic_Score), binwidth=0.1, fill="#7CAE00", colour="grey")+
xlab("Critic Score")+
ylab("Number of Games")+
ggtitle("Top 100 Video Game Critic Scores")+
theme(plot.title = element_text(hjust=0.5))
rating_100

genre_100

year_100

pub_100

critic_100

Critic Score vs. User Score
#create dataset without missing values in Critic or User score
data2 <- games_data[complete.cases(games_data$Critic_Score, games_data$User_Score),] #218 observations
#plot Critic score vs. User Score colored by ESRB Rating in plotly
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot3 <- plot_ly(data=data2, x = ~Critic_Score, y = ~User_Score, color = ~ESRB_Rating, type="scatter", mode = "markers")
plot3 <- plot3 %>% layout(title = "Critic Score vs. User Score",
xaxis = list(title = "Critic Score"),
yaxis = list(title = "User Score"))
plot3
#correlation between Critic and User Scores
cor(data2$Critic_Score, data2$User_Score)
## [1] 0.5826729
Critic Score and User Score vs. Sales
#create column in data that combines Total_Shipped and Global_Sales
#every observation with missing values in Total_shipped has value in Global_Sales and vice versa.
#Rank is by Total_Shipped and when there is a missing value, it goes by Global_Sales
games_data$total <- coalesce(games_data$Total_Shipped, games_data$Global_Sales)
#create subset of data without missing data in Critic_Score and total sales
data3 <- games_data[complete.cases(games_data$Critic_Score, games_data$total),] #4861 observations
#create subset of data without missing data in User_Score and total sales
data4 <- games_data[complete.cases(games_data$User_Score, games_data$total),] #242 observations
#plot Critic Score vs Total Sales, colored by ESRB Rating
plot4 <- plot_ly(data=data3, x = ~Critic_Score, y = ~total, color = ~ESRB_Rating, type="scatter", mode = "markers")
plot4 <- plot4 %>% layout(title = "Critic Score vs. Total Sales",
xaxis = list(title = "Critic Score"),
yaxis = list(title = "Total Sales (Millions)"))
#plot Critic Score vs Total Sales, colored by ESRB Rating
plot5 <- plot_ly(data=data4, x = ~User_Score, y = ~total, color = ~ESRB_Rating, type="scatter", mode = "markers")
plot5 <- plot5 %>% layout(title = "User Score vs. Total Sales",
xaxis = list(title = "User Score"),
yaxis = list(title = "Total Sales (Millions)"))
plot4
#correlation of Critic Score and Total Sales
cor(data3$Critic_Score, data3$total)
## [1] 0.2595252
plot5
#correlation of User Score and Total Sales
cor(data4$User_Score, data4$total)
## [1] 0.160098